{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOpOyMiqfbbOTtWOqB0p5aB"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"r_NGZ_LM39L9","executionInfo":{"status":"ok","timestamp":1746695359450,"user_tz":-480,"elapsed":8460,"user":{"displayName":"Wen Qu","userId":"17890693673485394938"}},"outputId":"ee7176cd-55ce-4d3b-ae73-a3b6eba920d1"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: openai in /usr/local/lib/python3.11/dist-packages (1.76.2)\n","Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from openai) (4.9.0)\n","Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from openai) (1.9.0)\n","Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from openai) (0.28.1)\n","Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from openai) (0.9.0)\n","Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.11/dist-packages (from openai) (2.11.4)\n","Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai) (1.3.1)\n","Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.11/dist-packages (from openai) (4.67.1)\n","Requirement already satisfied: typing-extensions<5,>=4.11 in /usr/local/lib/python3.11/dist-packages (from openai) (4.13.2)\n","Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.11/dist-packages (from anyio<5,>=3.5.0->openai) (3.10)\n","Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai) (2025.4.26)\n","Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai) (1.0.9)\n","Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.16.0)\n","Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3,>=1.9.0->openai) (0.7.0)\n","Requirement already satisfied: pydantic-core==2.33.2 in /usr/local/lib/python3.11/dist-packages (from pydantic<3,>=1.9.0->openai) (2.33.2)\n","Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3,>=1.9.0->openai) (0.4.0)\n"]}],"source":["!pip install openai"]},{"cell_type":"code","source":["!gdown 16FSh45xge5RybunR6-4tER8Vi_pXXx4a"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"lQzBPX2g4uHw","executionInfo":{"status":"ok","timestamp":1746695364774,"user_tz":-480,"elapsed":4674,"user":{"displayName":"Wen Qu","userId":"17890693673485394938"}},"outputId":"e758a965-9fc4-48aa-ad22-170af970fb6a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Downloading...\n","From: https://drive.google.com/uc?id=16FSh45xge5RybunR6-4tER8Vi_pXXx4a\n","To: /content/German_French_UK_China_MENA_040420.csv\n","\r  0% 0.00/2.88M [00:00<?, ?B/s]\r100% 2.88M/2.88M [00:00<00:00, 53.4MB/s]\n"]}]},{"cell_type":"code","source":["label2num = {\"Monitoring environmental impact\": 1,\n","              \"Preventing pollution\": 2,\n","              \"Strengthening ecosystems\": 3,\n","              \"Reducing use\": 4,\n","              \"Reusing\": 5,\n","              \"Recycling\": 6,\n","              \"Repurposing\":7,\n","              \"Encouraging and supporting others\": 8,\n","              \"Educating and training for sustainability\": 9,\n","              \"Creating sustainable products and processes\": 10,\n","              \"Embracing innovation for sustainability\": 11,\n","              \"Changing how work is done\": 12,\n","              \"Choosing responsible alternatives\": 13,\n","              \"Instituting programs and policies\": 14,\n","              \"Others\":15}\n","\n","for key, val in label2num.items():\n","  label2num[key] = val-1\n","\n","def datasetclearning(df):\n","  # Select two relevant columns from the original dataset\n","  df = df[['Major_Industry','description_behavior', 'consequences','Specific_cat']]\n","  df['description_behavior'] = df['description_behavior']+' '+df['consequences']\n","  # Collapse the data for the NLP paper\n","  other = [ \"Putting environmental interests first\", \"Lobbying and activism\"]\n","  # Collapse \"Putting environmental interest first\" and \"Lobbying and Activism\" into \"the Other\" category\n","  replacement = df['Specific_cat'].where(df['Specific_cat'].isin(other) == False, \"Others\")\n","  df = pd.concat([df['description_behavior'], replacement], axis =1)\n","  num2label = {y:x for x,y in label2num.items()}\n","  # Convert text label to number numeric\n","  df_label = df['Specific_cat'].map(label2num)\n","  df_label.name = 'label'\n","  df = pd.concat([df,df_label], axis = 1)\n","  # Retain only the behaviors and numberic label\n","  df = df[['description_behavior','label']]\n","  # Rename the columns of output dataframe\n","  df.columns = ['text', 'label']\n","  df['text'] = df['text'].str.replace(\"\\n \", \" \")\n","  # Return the dataframe\n","  return [df, num2label]"],"metadata":{"id":"_nV0_ErY4xHt"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import openai\n","import pandas as pd\n","\n","from openai import OpenAI\n","from sklearn.model_selection import train_test_split\n","\n","df = pd.read_csv(\"/content/German_French_UK_China_MENA_040420.csv\")\n","df, num2label = datasetclearning(df)\n","\n","def test_train_split(df, seed):\n","  X_train, X_test, Y_train, Y_test, indices_train, indices_test = train_test_split(df['text'], df['label'], df.index, test_size=2449, random_state = seed, stratify = df['label'])\n","  return [list(X_train), list(X_test), list(Y_train), list(Y_test)]\n","\n","def data_prep(df, seed):\n","  # 1. Test-train split (Test set has 2449 cases across simulation conditions)\n","  X_train, X_test, Y_train, Y_test = test_train_split(df, seed)\n","  return X_train, X_test, Y_train, Y_test"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sLFjVyig4xxp","executionInfo":{"status":"ok","timestamp":1746695401770,"user_tz":-480,"elapsed":4861,"user":{"displayName":"Wen Qu","userId":"17890693673485394938"}},"outputId":"911b1b80-ee8b-486b-9a43-bd8a12ed7dbf"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["<ipython-input-3-2814818bdd97>:23: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n","  df['description_behavior'] = df['description_behavior']+' '+df['consequences']\n"]}]},{"cell_type":"code","source":["import random\n","def create_fewshot_examples(X_train, Y_train, num_examples_per_class=1, random_seed=11):\n","    random.seed(random_seed)\n","    fewshot_examples = \"\"\n","    for label_text, label_num in label2num.items():\n","        # Select a few examples from the training data for this label\n","        label_indices = [i for i, x in enumerate(Y_train) if x == label_num]\n","        selected_indices = random.sample(label_indices, num_examples_per_class)\n","        examples = [X_train[i] for i in selected_indices]\n","\n","        for example in examples:\n","            fewshot_examples += f\"###{example}###\\n{label_text}\\n\\n\"\n","\n","    return fewshot_examples"],"metadata":{"id":"mDV2ISZW48mg"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#import re\n","#import json\n","prompting_0 = f\"We have these following classes: [{', '.join(label2num.keys())}]\"\n","\n","\n","client = OpenAI(\n","    api_key=\"your key\",\n",")\n","\n","def ask_p0(content, fewshot_examples):\n","\n","  prompt_start = prompting_0 + \"\\nHere are some examples:\\n\" + fewshot_examples + f\"\"\"\\nNow please classify the following content into one of the above classes:\n","\n","  ###\n","  {content}\n","  ###\n","  \"\"\" # Include fewshot examples\n","\n","  prompt_end = \"\"\"\n","  Return the predicted classes ONLY. Do not include any other kind of output.\n","  \"\"\"\n","  # Construct the prompt with all contents\n","  prompt = prompt_start+prompt_end\n","\n","  #print(\"Prompt sent to the model:\")\n","  #print(prompt)\n","\n","  completion = client.chat.completions.create(\n","      model=\"gpt-4o\",\n","      messages = [\n","       {\"role\": \"system\", \"content\": \"You are a classification assistant that STRICTLY follows instructions.\"},\n","       {\"role\": \"user\", \"content\": prompt}\n","   ]\n","  )\n","\n","  response_content = completion.choices[0].message.content\n","\n","  #print(response_content)\n","  #print(predicted_classes)\n","\n","  return response_content"],"metadata":{"id":"GWD_lCBl5C84"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import time\n","from sklearn.metrics import classification_report\n","from collections import defaultdict\n","\n","p0_precisions = []\n","p0_recalls = []\n","p0_f1s = []\n","\n","\n","for seed in range(1, 6):\n","  start_time = time.time()\n","  print(f\"current seed: {seed}\")\n","  X_train, X_test, Y_train, Y_test = data_prep(df, seed)\n","\n","  fewshot_examples= create_fewshot_examples(X_train, Y_train)\n","\n","  predictions_p0 = []\n","  for i in range(len(X_test)):\n","    if i % 100 == 0:\n","      print(f\"current index: {i}\")\n","    predicted_class = ask_p0(X_test[i],fewshot_examples)\n","    number_failures = 1\n","    while predicted_class not in label2num.keys():\n","      print(predicted_class, f\"index = {i}, f = {number_failures}, try again\")\n","      number_failures += 1\n","      predicted_class = ask_p0(X_test[i],fewshot_examples)\n","    predictions_p0.append(label2num[predicted_class])\n","\n","  metrics_p0 = classification_report(Y_test, predictions_p0[:2449], output_dict = True)\n","  p0_precisions.append(metrics_p0['weighted avg'].get('precision'))\n","  p0_recalls.append(metrics_p0['weighted avg'].get('recall'))\n","  p0_f1s.append(metrics_p0['weighted avg'].get('f1-score'))\n","  end_time = time.time()\n","  duration_1round = end_time - start_time\n","  print(f\"Duration of {seed} round: {duration_1round} seconds\")\n","\n","  import numpy as np\n","\n","print(p0_precisions, p0_recalls, p0_f1s)\n","\n","print(\"mean precision p0\", np.mean(p0_precisions))\n","print(\"mean recall p0\", np.mean(p0_recalls))\n","print(\"mean f1 p0\", np.mean(p0_f1s))"],"metadata":{"id":"zvCSn7Fv5G-8"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import numpy as np\n","\n","print(p0_precisions, p0_recalls, p0_f1s)\n","\n","print(\"mean precision p0\", np.mean(p0_precisions))\n","print(\"mean recall p0\", np.mean(p0_recalls))\n","print(\"mean f1 p0\", np.mean(p0_f1s))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cgKFi_fY5OiS","executionInfo":{"status":"ok","timestamp":1746707240446,"user_tz":-480,"elapsed":321,"user":{"displayName":"Wen Qu","userId":"17890693673485394938"}},"outputId":"055ee718-2478-4c9f-82c6-211c39157573"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[0.4794281810199533, 0.5010628840761446, 0.49376887249115037, 0.5003240247719517, 0.46073712398465255] [0.4650877909350755, 0.48019599836668025, 0.4748877092690894, 0.4887709269089424, 0.4663127807268273] [0.46083694697088784, 0.4688863688534532, 0.4679988117633821, 0.47884278061551305, 0.4469737760400176]\n","mean precision p0 0.4870642172687705\n","mean recall p0 0.475051041241323\n","mean f1 p0 0.46470773684865074\n"]}]}]}